{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "ef359482",
   "metadata": {},
   "source": [
    "# Collect house price data\n",
    "\n",
    "The best way to collect house price data is using [Zillow API](https://www.zillow.com/howto/api/APIOverview.htm).\n",
    "\n",
    "This notebook is an academic demonstration of how to crawl information from websites. The data used in this demo is a single page manually downloaded from [Trulia](https://www.trulia.com) and then uploaded to a dummy website hosted by AWS S3."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e0512d74",
   "metadata": {},
   "source": [
    "## Import libraries "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "da95028f",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas\n",
    "import configparser\n",
    "import psycopg2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a1d46390",
   "metadata": {},
   "outputs": [],
   "source": [
    "config = configparser.ConfigParser()\n",
    "config.read('config.ini')\n",
    "\n",
    "host = config['myaws']['host']\n",
    "db = config['myaws']['db']\n",
    "user = config['myaws']['user']\n",
    "pwd = config['myaws']['pwd']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0994a681",
   "metadata": {},
   "outputs": [],
   "source": [
    "conn = psycopg2.connect(host = host,\n",
    "                       user = user,\n",
    "                        password = pwd,\n",
    "                        dbname = db\n",
    "                       )\n",
    "cur = conn.cursor()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4a51ff48",
   "metadata": {},
   "source": [
    "## Create a table in database"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fa812386",
   "metadata": {},
   "outputs": [],
   "source": [
    "# replace the schema and table name to your schema and table name if necessary\n",
    "table_sql = \"\"\"\n",
    "            CREATE TABLE IF NOT EXISTS house\n",
    "            (\n",
    "\n",
    "                price integer,\n",
    "                bed integer,\n",
    "                bath integer,\n",
    "                area integer,\n",
    "                address VARCHAR(200),\n",
    "                PRIMARY KEY(address)\n",
    "            );\n",
    "\n",
    "            \"\"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6ad18399",
   "metadata": {},
   "outputs": [],
   "source": [
    "# conn.rollback()\n",
    "# table_sql=\"drop table if exists house\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c8162ea3",
   "metadata": {},
   "outputs": [],
   "source": [
    "cur.execute(table_sql)\n",
    "conn.commit()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6454bd95",
   "metadata": {},
   "source": [
    "## Define the URL"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f4320408",
   "metadata": {},
   "source": [
    "Fill in the S3 website URL to the `url` variable."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d8a26435",
   "metadata": {},
   "outputs": [],
   "source": [
    "url = ''"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3ca773fc",
   "metadata": {},
   "source": [
    "## Collect the research results "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8ab0aebd",
   "metadata": {},
   "outputs": [],
   "source": [
    "import urllib.request\n",
    "response = urllib.request.urlopen(url)\n",
    "html_data= response.read()\n",
    "# print(html_data.decode('utf-8'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d74bd6a5",
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "from bs4 import BeautifulSoup\n",
    "soup = BeautifulSoup(html_data,'html.parser')\n",
    "# print (soup)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "76b28a69",
   "metadata": {},
   "source": [
    "## Insert the records into database"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4013b37e",
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "for ul in soup.find_all('ul'):\n",
    "    for li_class in ul.find_all('li'):\n",
    "        try:\n",
    "            for price_div in li_class.find_all('div',{'data-testid':'property-price'}):\n",
    "                price =int(price_div.text.replace('$','').replace(\",\",\"\"))\n",
    "            for bed_div in li_class.find_all('div', {'data-testid':'property-beds'}):\n",
    "                bed= int(bed_div.text.replace('bd','').replace(\",\",\"\"))\n",
    "            for bath_div in li_class.find_all('div',{'data-testid':'property-baths'}):\n",
    "                bath =int(bath_div.text.replace('ba','').replace(\",\",\"\"))\n",
    "            for area_div in li_class.find_all('div',{'data-testid':'property-floorSpace'}):\n",
    "                area=int(area_div.text.split('sqft')[0].replace(\",\",\"\"))\n",
    "            for address_div in li_class.find_all('div',{'data-testid':'property-address'}):\n",
    "                address =address_div.text\n",
    "            try:\n",
    "                sql_insert = \"\"\"\n",
    "                            insert into house(price,bed,bath,area,address)\n",
    "                            values('{}','{}','{}','{}','{}')            \n",
    "                            \"\"\".format(price,bed,bath,area,address)\n",
    "\n",
    "                cur.execute(sql_insert)\n",
    "                conn.commit()\n",
    "            except:\n",
    "                conn.rollback()\n",
    "        except:\n",
    "            pass"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c9ce2145",
   "metadata": {},
   "source": [
    "## Query the table"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "627fb4e1",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "df = pandas.read_sql_query('select * from house ', conn)\n",
    "df[:]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "10b760b9",
   "metadata": {},
   "source": [
    "## Basic statistics "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "95d26035",
   "metadata": {},
   "outputs": [],
   "source": [
    "df.describe()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0be021bc",
   "metadata": {},
   "source": [
    "## Price distribution"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fd60d1aa",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "df['price'].hist()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "conda_python3",
   "language": "python",
   "name": "conda_python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
